This document examines the effect that modifying the values of individual parameters of the Random Forest algorithm in sklearn has on its accuracy and on the time taken to train the model.
Best results were when we used the following settings:
sample_size=9000
random_state=303
and all the other arguments left at their defaults. This resulted in an out-of-sample accuracy of 0.963111111111.
If we keep the sample size at its default value of 1000, then the best results were obtained when the following settings were used:
random_state: 3599
And all the other arguments left at their defaults. This resulted in an out-of-sample accuracy of 0.944.
import numpy as np
from rf_tester import *
from parameter_plots import *
from prep_terrain_data import makeTerrainData
from bokeh.io import output_notebook
output_notebook()
# Shared experiment configuration.  Each sweep below varies exactly one
# of these values while holding the rest fixed.
data_generator = makeTerrainData  # project helper that builds the terrain dataset
sample_size = 1000
random_state = 3599  # best seed found for the default sample size (see notes above)

# RandomForestClassifier defaults, pinned here so every sweep passes them
# explicitly and stays comparable even if sklearn's defaults change.
default_n_estimators = 10
default_criterion = "gini"
default_max_depth = None
default_min_samples_split = 2
default_min_samples_leaf = 1
default_min_weight_fraction_leaf = 0.0
# NOTE(review): max_features='auto' was deprecated and later removed in
# modern scikit-learn -- confirm against the installed version.
default_max_features = 'auto'
default_max_leaf_nodes = None
default_bootstrap = True
default_oob_score = False
default_warm_start = False
default_class_weight = None

# Materialize as a concrete list so the seeds can be iterated more than
# once and behave identically on Python 2 (range -> list) and Python 3
# (range -> lazy object).
seeds = list(range(1, 10000))
# Sweep the random seed while every other hyper-parameter stays at its
# default, then plot accuracy and training time per seed.
_seed_params = dict(
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=seeds,  # the swept parameter
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
seed_results = loop_rf(data_generator, **_seed_params)
parameter_plots(seeds, results_dict=seed_results,
                x_label="Seed #",
                title_accuracy="Seed # vs Accuracy",
                title_time="Seed # vs Training Time",
                legend_pos="left_center")
# Sample sizes: fine-grained 1k steps up to 30k, then progressively
# coarser steps out to 200k.  The list(...) wrappers fix a Python 3
# incompatibility: range() is lazy there, so `range(...) + range(...)`
# raises TypeError.  On Python 2 (range -> list) they are a no-op.
sample_sizes = (list(range(1000, 30000 + 1, 1000)) +
                list(range(35000, 60000 + 1, 5000)) +
                list(range(70000, 100000 + 1, 10000)) +
                [150000, 200000])
# Vary only the sample size; random_state is pinned to 303 (the best
# seed found -- see the notes at the top) so the runs are reproducible.
sample_sizes_results = loop_rf(data_generator,
                               sample_size=sample_sizes,
                               n_estimators=default_n_estimators,
                               criterion=default_criterion,
                               max_depth=default_max_depth,
                               min_samples_split=default_min_samples_split,
                               min_samples_leaf=default_min_samples_leaf,
                               min_weight_fraction_leaf=default_min_weight_fraction_leaf,
                               max_features=default_max_features,
                               max_leaf_nodes=default_max_leaf_nodes,
                               bootstrap=default_bootstrap,
                               oob_score=default_oob_score,
                               random_state=303,
                               warm_start=default_warm_start,
                               class_weight=default_class_weight
                               )
# Plot the x axis in thousands so the tick labels stay readable.
scaled_sample_sizes = np.array(sample_sizes) / 1000.0
parameter_plots(scaled_sample_sizes, results_dict=sample_sizes_results,
                x_label="Sample Size (in thousands)",
                title_accuracy="Sample Size vs Accuracy",
                title_time="Sample Size vs Training Time",
                legend_pos="bottom_right")
# Compare the two split-quality criteria, all other arguments default.
criteria = ["gini", "entropy"]
_criterion_params = dict(
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=criteria,  # the swept parameter
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
criteria_results = loop_rf(data_generator, **_criterion_params)
# Depths 1-29 exhaustively, then coarser steps to 200.  The list(...)
# wrappers fix a Python 3 incompatibility: range() is lazy there, so
# `range(...) + range(...)` raises TypeError (no-op on Python 2).
max_depths = (list(range(1, 30)) +
              list(range(35, 60, 5)) +
              list(range(70, 200, 10)))
# Vary only max_depth, with the seed pinned for reproducibility.
max_depths_results = loop_rf(data_generator,
                             sample_size=sample_size,
                             n_estimators=default_n_estimators,
                             criterion=default_criterion,
                             max_depth=max_depths,
                             min_samples_split=default_min_samples_split,
                             min_samples_leaf=default_min_samples_leaf,
                             min_weight_fraction_leaf=default_min_weight_fraction_leaf,
                             max_features=default_max_features,
                             max_leaf_nodes=default_max_leaf_nodes,
                             bootstrap=default_bootstrap,
                             oob_score=default_oob_score,
                             random_state=random_state,
                             warm_start=default_warm_start,
                             class_weight=default_class_weight
                             )
parameter_plots(max_depths, results_dict=max_depths_results,
                x_label="Max Depth",
                title_accuracy="Max Depth vs Accuracy",
                title_time="Max Depth vs Training Time",
                legend_pos="bottom_right")
# Sweep min_samples_split over 2..99 with everything else at defaults.
min_samples_splits = range(2, 100)
_msp_params = dict(
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=min_samples_splits,  # the swept parameter
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
min_samples_split_results = loop_rf(data_generator, **_msp_params)
parameter_plots(min_samples_splits, results_dict=min_samples_split_results,
                x_label="min samples splits",
                title_accuracy="min samples splits vs Accuracy",
                title_time="min samples splits vs Training Time",
                legend_pos="top_right")
# Sweep min_samples_leaf over 1..99 with everything else at defaults.
min_samples_leafs = range(1, 100)
_msl_params = dict(
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=min_samples_leafs,  # the swept parameter
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
min_samples_leaf_results = loop_rf(data_generator, **_msl_params)
parameter_plots(min_samples_leafs, results_dict=min_samples_leaf_results,
                x_label="min_samples_leafs",
                title_accuracy="min_samples_leafs vs Accuracy",
                title_time="min_samples_leafs vs Training Time",
                legend_pos="top_right")
# Sweep max_leaf_nodes over 2..299 with everything else at defaults.
max_leaf_nodes = range(2, 300)
_mln_params = dict(
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=max_leaf_nodes,  # the swept parameter
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
max_leaf_nodes_results = loop_rf(data_generator, **_mln_params)
parameter_plots(max_leaf_nodes, results_dict=max_leaf_nodes_results,
                x_label="max_leaf_nodes",
                title_accuracy="max_leaf_nodes vs Accuracy",
                title_time="max_leaf_nodes vs Training Time",
                legend_pos="top_right")
# Toggle bootstrap sampling on/off with everything else at defaults.
bootstraps = [False, True]
_bootstrap_params = dict(
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=bootstraps,  # the swept parameter
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
bootstrap_results = loop_rf(data_generator, **_bootstrap_params)
parameter_plots(bootstraps, results_dict=bootstrap_results,
                x_label="bootstrap",
                title_accuracy="bootstrap vs Accuracy",
                title_time="bootstrap vs Training Time",
                legend_pos="left_center")
# Toggle the out-of-bag score estimate with everything else at defaults.
oobs = [False, True]
_oob_params = dict(
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=oobs,  # the swept parameter
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
oob_results = loop_rf(data_generator, **_oob_params)
parameter_plots(oobs, results_dict=oob_results,
                x_label="Using Out of bag Error Estimate",
                title_accuracy="Using Out of bag Error Estimate vs Accuracy",
                title_time="Using Out of bag Error Estimate vs Training Time",
                legend_pos="top_right")
# Toggle warm_start with everything else at defaults.
warm_starts = [False, True]
_ws_params = dict(
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=warm_starts,  # the swept parameter
    class_weight=default_class_weight,
)
warm_start_results = loop_rf(data_generator, **_ws_params)
parameter_plots(warm_starts, results_dict=warm_start_results,
                x_label="Warm Start",
                title_accuracy="Warm Start vs Accuracy",
                title_time="Warm Start vs Training Time",
                legend_pos="right_center")
# Compare the class-weighting schemes with everything else at defaults.
class_weights = ["balanced", "balanced_subsample", None]
_cw_params = dict(
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=class_weights,  # the swept parameter
)
class_weight_results = loop_rf(data_generator, **_cw_params)